{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import names\n",
    "\n",
    "from tools import show_subtitle"
   ]
  },
  {
   "source": [
    "# Chap6 学习分类文本\n",
    "\n",
    "学习目标：\n",
    "\n",
    "1.  识别出语言数据中可以用于分类的特征\n",
    "2.  构建用于自动执行语言处理任务的语言模型\n",
    "3.  从语言模型中学习与语言相关的知识"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 6.2 有监督分类的应用场景"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 6.2.1 句子分割\n",
    "（标点符号的分类任务，遇到可能会结束句子的符号时，二元判断是否应该断句）"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 第一步：获得已经被分割成句子的数据\n",
    "sents = nltk.corpus.treebank_raw.sents()\n",
    "tokens = []\n",
    "boundaries = set()\n",
    "offset = 0\n",
    "# 标注所有句子结束符号的位置\n",
    "for sent in sents:\n",
    "    tokens.extend(sent)  # 句子标识符的合并链表，把所有原始的句子都合并成单词列表\n",
    "    offset += len(sent)  #\n",
    "    boundaries.add(offset - 1)  # 包含所有句子-边界标识符索引的集合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 标点的数据特征\n",
    "def punct_features(tokens, i):\n",
    "    \"\"\"\n",
    "    标点（punctuation）符号的特征\n",
    "    :param tokens: 已经分词的标记集\n",
    "    :type tokens:\n",
    "    :param i: 需要抽取特征的标点符号的位置\n",
    "    :type i:\n",
    "    :return:\n",
    "    :rtype:\n",
    "    \"\"\"\n",
    "    return {\n",
    "            'next-word-capitalized': tokens[i + 1][0].isupper(),\n",
    "            'prevword': tokens[i - 1].lower(),\n",
    "            'punct': tokens[i],\n",
    "            'prev-word-is-one-char': len(tokens[i - 1]) == 1\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "0 ) ({'next-word-capitalized': False, 'prevword': 'nov', 'punct': '.', 'prev-word-is-one-char': False}, False)\n1 ) ({'next-word-capitalized': True, 'prevword': '29', 'punct': '.', 'prev-word-is-one-char': False}, True)\n2 ) ({'next-word-capitalized': True, 'prevword': 'mr', 'punct': '.', 'prev-word-is-one-char': False}, False)\n3 ) ({'next-word-capitalized': True, 'prevword': 'n', 'punct': '.', 'prev-word-is-one-char': True}, False)\n4 ) ({'next-word-capitalized': False, 'prevword': 'group', 'punct': '.', 'prev-word-is-one-char': False}, True)\n5 ) ({'next-word-capitalized': True, 'prevword': '.', 'punct': '.', 'prev-word-is-one-char': True}, False)\n6 ) ({'next-word-capitalized': False, 'prevword': 'conglomerate', 'punct': '.', 'prev-word-is-one-char': False}, True)\n7 ) ({'next-word-capitalized': True, 'prevword': '.', 'punct': '.', 'prev-word-is-one-char': True}, False)\n8 ) ({'next-word-capitalized': True, 'prevword': 'reported', 'punct': '.', 'prev-word-is-one-char': False}, True)\n9 ) ({'next-word-capitalized': True, 'prevword': 'said', 'punct': '.', 'prev-word-is-one-char': False}, True)\n"
     ]
    }
   ],
   "source": [
    "# 第二步：建立标点符号的特征集合\n",
    "feature_sets = [(punct_features(tokens, i), (i in boundaries))\n",
    "                for i in range(1, len(tokens) - 1)\n",
    "                if tokens[i] in '.?!']\n",
    "for i, feature in enumerate(feature_sets):\n",
    "    if i<10:\n",
    "        print(i,\")\",feature)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "0.936026936026936"
      ]
     },
     "metadata": {},
     "execution_count": 5
    }
   ],
   "source": [
    "# 使用这些特征集，训练和评估一个标点符号分类器\n",
    "size = int(len(feature_sets) * 0.1)\n",
    "train_set, test_set = feature_sets[size:], feature_sets[:size]\n",
    "classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
    "nltk.classify.accuracy(classifier, test_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ex6-6 基于分类的断句器\n",
    "# ToDo:原理是基于分类器对句子进行分类，但是没提供用于测试的数据\n",
    "def segment_sentences(words):\n",
    "    start = 0\n",
    "    sents = []\n",
    "    for i, word in words:\n",
    "        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:\n",
    "            sents.append(words[start:i + 1])\n",
    "        if start < len(words):\n",
    "            sents.append(words[start:])\n",
    "            return sents"
   ]
  },
  {
   "source": [
    "## 2.2. 识别对话行为类型"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "0 ) <Element 'Post' at 0x000001D0FD2DABD8>\n1 ) <Element 'Post' at 0x000001D0FD2EF368>\n2 ) <Element 'Post' at 0x000001D0FD2EFA48>\n3 ) <Element 'Post' at 0x000001D0FD2EF458>\n4 ) <Element 'Post' at 0x000001D0FD2A3F48>\n5 ) <Element 'Post' at 0x000001D0FD292728>\n6 ) <Element 'Post' at 0x000001D0FD300E08>\n7 ) <Element 'Post' at 0x000001D0FD311B38>\n8 ) <Element 'Post' at 0x000001D0FD2DADB8>\n9 ) <Element 'Post' at 0x000001D0FD2DAE08>\n"
     ]
    }
   ],
   "source": [
    "# 对话的行为类型\n",
    "# Statement, System, Greet, Emotion, ynQuestion, whQuestion, Accept, Bye, Emphasis, Continuer, Reject, yAnswer, nAnswer, Clarify, Other\n",
    "posts = nltk.corpus.nps_chat.xml_posts()[:10000]\n",
    "for i,post in enumerate(posts):\n",
    "    if i<10:\n",
    "        print(i,')',post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dialogue_act_features(post):\n",
    "    features = {}\n",
    "    for word in nltk.word_tokenize(post):\n",
    "        features['contains({})'.format(word.lower())] = True\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "0 ) ({'contains(now)': True, 'contains(im)': True, 'contains(left)': True, 'contains(with)': True, 'contains(this)': True, 'contains(gay)': True, 'contains(name)': True}, 'Statement')\n1 ) ({'contains(:)': True, 'contains(p)': True}, 'Emotion')\n2 ) ({'contains(part)': True}, 'System')\n3 ) ({'contains(hey)': True, 'contains(everyone)': True}, 'Greet')\n4 ) ({'contains(ah)': True, 'contains(well)': True}, 'Statement')\n5 ) ({'contains(nick)': True, 'contains(:10-19-20suser7)': True}, 'System')\n6 ) ({'contains(10-19-20suser7)': True, 'contains(is)': True, 'contains(a)': True, 'contains(gay)': True, 'contains(name)': True, 'contains(.)': True}, 'Accept')\n7 ) ({'contains(.action)': True, 'contains(gives)': True, 'contains(10-19-20suser121)': True, 'contains(a)': True, 'contains(golf)': True, 'contains(clap)': True, 'contains(.)': True}, 'System')\n8 ) ({'contains(:)': True, 'contains())': True}, 'Emotion')\n9 ) ({'contains(join)': True}, 'System')\n"
     ]
    }
   ],
   "source": [
    "feature_sets = [\n",
    "    (dialogue_act_features(post.text), post.get('class'))\n",
    "    for post in posts\n",
    "]\n",
    "for i,feature in enumerate(feature_sets):\n",
    "    if i<10:\n",
    "        print(i,')',feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[('Statement', 3058),\n",
       " ('System', 2380),\n",
       " ('Greet', 1325),\n",
       " ('Emotion', 1073),\n",
       " ('ynQuestion', 511),\n",
       " ('whQuestion', 503),\n",
       " ('Accept', 224),\n",
       " ('Bye', 191),\n",
       " ('Emphasis', 182),\n",
       " ('Continuer', 161),\n",
       " ('Reject', 151),\n",
       " ('yAnswer', 102),\n",
       " ('nAnswer', 70),\n",
       " ('Clarify', 37),\n",
       " ('Other', 32)]"
      ]
     },
     "metadata": {},
     "execution_count": 10
    }
   ],
   "source": [
    "# 常用的对话行为分类\n",
    "classes = [category for _, category in feature_sets]\n",
    "classes_fd = nltk.FreqDist(classes)\n",
    "classes_fd.most_common()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "0.667"
      ]
     },
     "metadata": {},
     "execution_count": 11
    }
   ],
   "source": [
    "# 创建帖子分类器\n",
    "size = int(len(feature_sets) * 0.1)\n",
    "train_set, test_set = feature_sets[size:], feature_sets[:size]\n",
    "classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
    "nltk.classify.accuracy(classifier, test_set)"
   ]
  },
  {
   "source": [
    "### 2.3. 识别文字蕴涵 (Recognizing textual entailment, RTE)\n",
    "-   判断「文本T」内的一个给定片段 是否继承着 另一个叫做「假设」 的文本\n",
    "-   「文本」和「假设」之间的关系并不一定是逻辑蕴涵，而是人们是否会得出结论：文本提供的合理证据证明假设是真实的\n",
    "-   可以把RTE当作一个分类任务，尝试为每一对预测“True”/“False”标签\n",
    "    -   “True”表示保留了蕴涵；“False”表示没有保留蕴涵 "
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ex6-7：“认识文字蕴涵”的特征提取器\n",
    "def rte_features(rtepair):\n",
    "    \"\"\"\n",
    "    词（即词类型）作为信息的代理，计数词重叠的程度和假设中有而文本没有的词的程度\n",
    "    特征词包括（命名实体、）\n",
    "    :param rtepair:\n",
    "    :type rtepair:\n",
    "    :return:\n",
    "    :rtype:\n",
    "    \"\"\"\n",
    "    # RTEFeatureExtractor类建立了一个词汇包\n",
    "    # 这个词汇包中的词汇在文本和假设中都有的，并且已经除去了一些停用词\n",
    "    # 计算 重叠性 和 差异性\n",
    "    extractor = nltk.RTEFeatureExtractor(rtepair)\n",
    "    features = {}\n",
    "    features['word_overlap'] = len(extractor.overlap('word'))\n",
    "    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))\n",
    "    features['ne_overlap'] = len(extractor.overlap('ne'))\n",
    "    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "<RTEPair: gid=3-34>\n--------------- >文本中的单词< ---------------\n{'Iran', 'former', 'fledgling', 'Parviz', 'Shanghai', 'representing', 'Co', 'Russia', 'fight', 'Asia', 'that', 'China', 'at', 'four', 'terrorism.', 'Soviet', 'binds', 'SCO', 'operation', 'Organisation', 'association', 'Davudi', 'together', 'central', 'meeting', 'was', 'republics'}\n--------------- >假设中的单词< ---------------\n{'SCO.', 'China', 'member'}\n--------------- >文本和假设中重叠的单词（非实体词）< ---------------\nset()\n--------------- >文本和假设中重叠的实体词< ---------------\n{'China'}\n--------------- >文本和假设中差异的单词（非实体词）< ---------------\n{'member'}\n--------------- >文本和假设中差异的实体词< ---------------\n{'SCO.'}\n"
     ]
    }
   ],
   "source": [
    "# 取出文本-假设对的数据\n",
    "rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]\n",
    "print(rtepair)\n",
    "extractor = nltk.RTEFeatureExtractor(rtepair)\n",
    "show_subtitle(\"文本中的单词\")\n",
    "print(extractor.text_words)\n",
    "show_subtitle(\"假设中的单词\")\n",
    "print(extractor.hyp_words)\n",
    "show_subtitle(\"文本和假设中重叠的单词（非实体词）\")\n",
    "print(extractor.overlap('word'))\n",
    "show_subtitle(\"文本和假设中重叠的实体词\")\n",
    "print(extractor.overlap('ne'))\n",
    "show_subtitle(\"文本和假设中差异的单词（非实体词）\")\n",
    "print(extractor.hyp_extra('word'))\n",
    "show_subtitle(\"文本和假设中差异的实体词\")\n",
    "print(extractor.hyp_extra('ne'))"
   ]
  },
  {
   "source": [
    "## 2.4 扩展到大型的数据集\n",
    "NLTK提供对专业的机器学习软件包的支持，调用它们会比NLTK提供的分类器性能更好\n",
    "\n",
    "注：Scikit-Learn 的运行速度比 NLTK 提供的速度快，但是有的模型质量没有 NLTK 的好"
   ],
   "cell_type": "markdown",
   "metadata": {}
  }
 ]
}